package com.waya.demo.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Static helpers for downloading an HTML page by URL and post-processing its
 * markup (mainly turning relative {@code <img src>} values into prefixed ones).
 *
 * <p>All methods are stateless. The network-facing methods block the calling
 * thread, including while they sleep between retries.
 */
public class ReadHTMLByUrl {

	/** Number of additional attempts made by {@link #JsoupBodyHtml(String)} after the first one fails. */
	private static final int BODY_FETCH_RETRIES = 3;

	/** Pause before each retry in {@link #JsoupBodyHtml(String)}, in milliseconds. */
	private static final long BODY_RETRY_DELAY_MILLIS = 5 * 1000L;

	/** Pause before the single retry in {@link #JsoupHtml(String)}, in milliseconds. */
	private static final long HTML_RETRY_DELAY_MILLIS = 20 * 1000L;

	/**
	 * Downloads the resource at the given URL and returns its text.
	 *
	 * <p>The content is read line by line and the lines are concatenated
	 * <em>without</em> their line terminators, so the result is a single line.
	 *
	 * @param u        the URL to read; must use a protocol whose connection is an
	 *                 {@link HttpURLConnection}
	 * @param encoding name of the charset used to decode the response bytes
	 * @return the decoded response with line terminators removed
	 * @throws Exception if the URL is malformed, the charset is unsupported, or
	 *                   reading from the connection fails
	 */
	public static String readFile(String u, String encoding) throws Exception {
		StringBuilder html = new StringBuilder();
		URL url = new URL(u);

		HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
		// try-with-resources closes the reader (and the wrapped stream) on every path;
		// disconnect() in finally also covers the case where the stream was opened but
		// building the reader failed (e.g. unsupported encoding).
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(urlConnection.getInputStream(), encoding))) {
			String line;
			while ((line = reader.readLine()) != null) {
				html.append(line);
			}
		} finally {
			urlConnection.disconnect();
		}
		return html.toString();
	}

	/**
	 * Fetches the page at {@code url} with jsoup and returns the inner HTML of
	 * its {@code <body>}.
	 *
	 * <p>If the first request fails with an {@link IOException}, up to
	 * {@value #BODY_FETCH_RETRIES} further attempts are made, each preceded by a
	 * fixed pause. If the thread is interrupted while waiting, the interrupt
	 * flag is restored and no further attempts are made.
	 *
	 * @param url the page URL
	 * @return the body's inner HTML, or an empty string if the page could not be
	 *         fetched or has no body
	 */
	public static String JsoupBodyHtml(String url) {
		Document doc = null;
		try {
			doc = Jsoup.connect(url).get();
		} catch (IOException firstFailure) {
			IOException lastFailure = firstFailure;
			for (int attempt = 0; attempt < BODY_FETCH_RETRIES; attempt++) {
				try {
					Thread.sleep(BODY_RETRY_DELAY_MILLIS);
					doc = Jsoup.connect(url).get();
					// Connected successfully: stop retrying.
					lastFailure = null;
					break;
				} catch (IOException retryFailure) {
					// Remember the failure and try again; it is reported below if
					// every attempt fails.
					lastFailure = retryFailure;
				} catch (InterruptedException interrupted) {
					// Preserve the interrupt for the caller and give up instead of
					// continuing to sleep.
					Thread.currentThread().interrupt();
					break;
				}
			}
			if (lastFailure != null) {
				lastFailure.printStackTrace();
			}
		}
		if (doc == null || doc.body() == null) {
			return "";
		}
		return doc.body().html();
	}

	/**
	 * Fetches the page at {@code url} with jsoup and returns the whole document
	 * as a string.
	 *
	 * <p>If the first request fails with an {@link IOException}, one more attempt
	 * is made after a fixed pause. If the thread is interrupted while waiting,
	 * the interrupt flag is restored and the retry is skipped.
	 *
	 * @param url the page URL
	 * @return the document's string form, or an empty string if both attempts fail
	 */
	public static String JsoupHtml(String url) {
		Document doc = null;
		try {
			doc = Jsoup.connect(url).get();
		} catch (IOException firstFailure) {
			firstFailure.printStackTrace();
			try {
				Thread.sleep(HTML_RETRY_DELAY_MILLIS);
				doc = Jsoup.connect(url).get();
			} catch (IOException retryFailure) {
				retryFailure.printStackTrace();
			} catch (InterruptedException interrupted) {
				// Preserve the interrupt for the caller rather than swallowing it.
				Thread.currentThread().interrupt();
			}
		}
		return doc == null ? "" : doc.toString();
	}

	/**
	 * Fetches the page at {@code url}, prefixes relative {@code <img src>} values
	 * with the page's directory URL, and removes every {@code style} attribute
	 * whose value contains the text {@code font}.
	 *
	 * <p>The prefix is {@code url} up to and including its last {@code '/'}. A
	 * {@code src} is left untouched when it is empty or starts with {@code http}.
	 *
	 * @param url the page URL
	 * @return the rewritten inner HTML of the body, or an empty string if
	 *         anything goes wrong (the exception is printed)
	 */
	public static String reWriteHtml(String url) {
		try {
			// Everything up to and including the last '/' of the page URL.
			String domainUrl = url.substring(0, url.lastIndexOf("/") + 1);
			Document doc = Jsoup.connect(url).get();
			Element body = doc.body();

			// Turn relative image paths into prefixed ones.
			for (Element img : body.select("img")) {
				String src = img.attr("src");
				// An empty src would otherwise be rewritten to the directory URL itself.
				if (!src.isEmpty() && !src.startsWith("http")) {
					img.attr("src", domainUrl + src);
				}
			}

			// Drop inline styles that mention a font property.
			for (Element element : body.getAllElements()) {
				if (element.attr("style").contains("font")) {
					element.removeAttr("style");
				}
			}
			return body.html();
		} catch (Exception e) {
			// Best-effort API: any failure (network, malformed URL, missing body)
			// yields an empty result.
			e.printStackTrace();
			return "";
		}
	}

	/**
	 * Fetches the body HTML of the page at {@code url} and prefixes the
	 * {@code src} of its {@code <img>} tags with the page's directory URL.
	 *
	 * <p>The prefix is {@code url} up to and including its last {@code '/'}.
	 * Only double-quoted {@code src} values that are non-empty and do not start
	 * with {@code http://} or {@code https://} are rewritten.
	 *
	 * @param url the page URL
	 * @return the body HTML with image sources rewritten, or an empty string if
	 *         the page could not be fetched or its body is empty
	 */
	public static String repairContent(String url) {
		String content = JsoupBodyHtml(url);
		if ("".equals(content)) {
			return "";
		}
		// Prefix to put in front of relative img src values.
		String replaceHttp = url.substring(0, url.lastIndexOf("/") + 1);
		String patternStr = "<img\\s*([^>]*)\\s*src=\\\"(.*?)\\\"\\s*([^>]*)>";

		content = replSrc(content, replaceHttp, patternStr);
		return content;
	}

	/**
	 * Prefixes the relative {@code src} values matched by {@code patternStr}
	 * (compiled case-insensitively) in {@code content}.
	 *
	 * @param content     the HTML text to rewrite
	 * @param replaceHttp the prefix to insert in front of each relative src
	 * @param patternStr  a regex whose capture group 2 is the src value
	 * @return {@code content} with the prefix inserted before every matched
	 *         relative src
	 */
	private static String replSrc(String content, String replaceHttp, String patternStr) {
		Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
		return matchSrc(content, replaceHttp, pattern, pattern.matcher(content));
	}

	/**
	 * Walks the matches of {@code matcher} and inserts {@code replaceHttp} in
	 * front of capture group 2 of each match, unless that group is empty or
	 * already starts with {@code http://} or {@code https://}.
	 *
	 * <p>The prefix is inserted at the matched position only. A global text
	 * replacement would also rewrite the same text outside the matched tag and
	 * would corrupt values that contain one another (for example {@code a.png}
	 * and {@code img/a.png}).
	 *
	 * @param content     the text being rewritten; {@code matcher} must have been
	 *                    created over this exact string, because match offsets
	 *                    are applied to it
	 * @param replaceHttp the prefix to insert
	 * @param pattern     unused; kept so the signature stays unchanged
	 * @param matcher     a fresh matcher over {@code content} whose group 2 is
	 *                    the src value
	 * @return the rewritten text
	 */
	private static String matchSrc(String content, String replaceHttp, Pattern pattern, Matcher matcher) {
		StringBuilder rewritten = new StringBuilder(content.length());
		int copiedUpTo = 0;
		while (matcher.find()) {
			String src = matcher.group(2);
			if (src.isEmpty() || src.startsWith("http://") || src.startsWith("https://")) {
				continue;
			}
			int srcStart = matcher.start(2);
			// Copy everything before the src value, then the prefix; the src value
			// itself is copied along with the following text on the next append.
			rewritten.append(content, copiedUpTo, srcStart).append(replaceHttp);
			copiedUpTo = srcStart;
		}
		rewritten.append(content, copiedUpTo, content.length());
		return rewritten.toString();
	}

	/**
	 * Collects capture group 2 of every match of {@code regex} in {@code source}.
	 *
	 * @param regex  a regular expression with at least two capture groups;
	 *               otherwise {@link Matcher#group(int)} throws on the first match
	 * @param source the text to search
	 * @return the group-2 values in match order; an element is {@code null} when
	 *         group 2 did not participate in that match
	 */
	private static List<String> getMatchers(String regex, String source) {
		Pattern pattern = Pattern.compile(regex);
		Matcher matcher = pattern.matcher(source);
		List<String> list = new ArrayList<String>();
		while (matcher.find()) {
			list.add(matcher.group(2));
		}
		return list;
	}
}
